{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: Simple Summary Statistics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Extract rudimentary statistical features, such as question lengths (in words and characters), differences and ratios of these lengths."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_list_id = 'simple_summaries'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Original question datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')\n",
    "df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessed and tokenized questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle')\n",
    "tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def word_difference_ratio(q1_tokens, q2_tokens):\n",
    "    return len(set(q1_tokens) ^ set(q2_tokens)) / (len(set(q1_tokens)) + len(set(q2_tokens)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_original_question_features(row):\n",
    "    q1 = row[0]\n",
    "    q2 = row[1]\n",
    "    \n",
    "    shorter_char_length = min(len(q1), len(q2))\n",
    "    longer_char_length = max(len(q1), len(q2))\n",
    "    \n",
    "    return [\n",
    "        np.log(shorter_char_length + 1),\n",
    "        np.log(longer_char_length + 1),\n",
    "        np.log(abs(longer_char_length - shorter_char_length) + 1),\n",
    "        shorter_char_length / longer_char_length,\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_tokenized_features(pair):\n",
    "    q1 = pair[0]\n",
    "    q2 = pair[1]\n",
    "    \n",
    "    shorter_token_length = min(len(q1), len(q2))\n",
    "    longer_token_length = max(len(q1), len(q2))\n",
    "    \n",
    "    return [\n",
    "        np.log(shorter_token_length + 1),\n",
    "        np.log(longer_token_length + 1),\n",
    "        np.log(abs(longer_token_length - shorter_token_length) + 1),\n",
    "        shorter_token_length / longer_token_length,\n",
    "        word_difference_ratio(q1, q2),\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract character-based features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r",
      "Batches:   0%|          | 0/405 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 405/405 [00:28<00:00, 14.23it/s]\n"
     ]
    }
   ],
   "source": [
    "features_original_train = kg.jobs.map_batch_parallel(\n",
    "    df_train.as_matrix(columns=['question1', 'question2']),\n",
    "    item_mapper=extract_original_question_features,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/Cellar/python3/3.6.3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n",
      "  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2346/2346 [02:26<00:00, 15.98it/s]\n"
     ]
    }
   ],
   "source": [
    "features_original_test = kg.jobs.map_batch_parallel(\n",
    "    df_test.as_matrix(columns=['question1', 'question2']),\n",
    "    item_mapper=extract_original_question_features,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract token-based features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 405/405 [00:48<00:00,  8.29it/s]\n"
     ]
    }
   ],
   "source": [
    "features_tokenized_train = kg.jobs.map_batch_parallel(\n",
    "    tokens_train,\n",
    "    item_mapper=extract_tokenized_features,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2346/2346 [05:52<00:00,  6.66it/s]\n"
     ]
    }
   ],
   "source": [
    "features_tokenized_test = kg.jobs.map_batch_parallel(\n",
    "    tokens_test,\n",
    "    item_mapper=extract_tokenized_features,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = np.hstack([features_original_train, features_tokenized_train])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = np.hstack([features_original_test, features_tokenized_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train: (404290, 9)\n",
      "X_test:  (2345796, 9)\n"
     ]
    }
   ],
   "source": [
    "print('X_train:', X_train.shape)\n",
    "print('X_test: ', X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = [\n",
    "    # Character features.\n",
    "    'shorter_char_len_log',\n",
    "    'longer_char_len_log',\n",
    "    'char_len_diff_log',\n",
    "    'char_len_ratio',\n",
    "    \n",
    "    # Token features.\n",
    "    'shorter_token_len_log',\n",
    "    'longer_token_len_log',\n",
    "    'token_len_diff_log',\n",
    "    'token_len_ratio',\n",
    "    'word_diff_ratio',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
